This is a response to Adeline's document about using random forests to explore the factors related to Covid-19. We liked the ideas it included and hope to extend it by exploring specific hypotheses about the question.

Motivation: Adeline's document sought to assess the importance of the wastewater covariates by fitting a random forest to the difference of N1 and cases. This is mathematically equivalent to running the forest on the residuals of the model Case = avg_sars_cov2_conc. This creates a problem because the difference remains strongly correlated with both the case signal and the wastewater signal, meaning that a large deviance is better explained by deviance in the two source vectors than by the need for another variable. Below we show both the correlations and the scatter plots. We then propose two more rigorous hypotheses that do not have the same limitations.

# Build a plot title reporting a correlation rounded to 5 decimal places.
title_prep <- function(x) {
  rounded <- round(x, 5)
  paste("Correlation is", rounded)
}
# Correlations of the N1-case difference with each of its two source signals;
# "na.or.complete" uses complete pairs only (NA if none remain).
case_cor <- with(data.select.1.sc,
                 cor(difference.log, case_rate, use = "na.or.complete"))
waste_cor <- with(data.select.1.sc,
                  cor(difference.log, avg_sars_cov2_conc, use = "na.or.complete"))

# Side-by-side scatter plots showing that the difference (N1 - cases) remains
# strongly correlated with each of its two source signals.
{
layout(matrix(1:2, 1, 2, byrow = TRUE))
# Left panel: difference vs. raw case rate.
plot(x = data.select.1.sc$case_rate, y = data.select.1.sc$difference.log, 
     xlab = "case rate", ylab = "log(Covid-19 concentration) - case rate", main = title_prep(case_cor))
# Right panel: difference vs. log concentration; the +1 guards against log(0).
plot(x = log(data.select.1.sc$avg_sars_cov2_conc + 1), y = data.select.1.sc$difference.log, 
     xlab = "log(Covid-19 concentration)", ylab = "", main = title_prep(waste_cor))
}

# Subset of the raw data: the candidate covariates plus the three possible
# response variables. NOTE(review): gggg_plot does not appear to be referenced
# again in this document — confirm before removing.
gggg_plot <- data%>% 
  select(wwtp_name, pmmov_conc, bcov_rec_rate, average_flow_rate, ph, temperature,
         BOD5, TSS, difference.log, case_rate, avg_sars_cov2_conc)


#summary(lm(log(case_rate + 1) ~  log(avg_sars_cov2_conc + 1):wwtp_name + wwtp_name, data = data.select.1.sc))

Hypothesis one: there is a linear relationship between the log of N1 and the log of cases. The covariates change the intercept of the relationship, but there is always a base relationship.

A quick look at the data makes this look plausible: a thick cloud that could be explained by many parallel linear models with different intercepts. There is a sequence of points where case_rate was 0 that do not fit the trend, but these can be removed or fixed using the 7-day smoothing of the case data. Furthermore, the residuals look normal, which helps support that conclusion.

# Work on a copy so added columns (e.g. resid below) do not alter the original.
data.select.2.sc <- data.select.1.sc
#data.select.2.sc
# Scatter of log cases vs. log concentration with an overall lm smooth;
# +1 inside log10 guards against zeros.
data.select.2.sc%>%
  ggplot(aes(x = log10(avg_sars_cov2_conc + 1), y = log10(case_rate + 1)))+
  geom_point()+
  geom_smooth(method = "lm")+
  ggtitle("strong linear relationship between variables")+
  xlab("log(Covid-19 concentration)")+
  ylab("log(case rate)")

# Hypothesis-one baseline: regress log N1 on log cases (N1 ~ cases, as stated
# in the text below). na.exclude preserves NA positions so resid() aligns
# row-for-row with the data frame.
lm_model = lm(log10(avg_sars_cov2_conc + 1) ~ log10(case_rate + 1), 
              data = data.select.2.sc, 
              na.action = na.exclude)

# Per-row residuals: the wastewater signal not explained by cases, used as the
# target of the residual forest below.
data.select.2.sc$resid = resid(lm_model)
  
  #(data.select.1.sc$case_rate -  coef(lm_model)[1])/data.select.1.sc$sars_cov2_adj_load

# Residuals of the log-log linear model; an approximately normal shape
# supports the linear-relationship hypothesis. (Fixed "apear" typo in title.)
hist(data.select.2.sc$resid, main = "histogram of linear model residuals appear normal", xlab = "residuals")

# Copy with case_rate renamed to conf_case, used by the flag-comparison plots
# at the end of the document.
base_df <- data.select.2.sc%>%
  rename(conf_case = case_rate)

Because the goal is to capture variation in wastewater that is not in cases, we used the residuals of the model N1 ~ cases. This method gives a much smaller correlation with the two source vectors, producing results much better at capturing the important covariates.

# Correlations of the lm residuals with the two source signals — these should
# be much weaker than the correlations of the raw difference shown above.
resid_case_cor <- cor(data.select.2.sc$resid, 
                      data.select.2.sc$case_rate, 
                      use = "na.or.complete")
resid_waste_cor <- cor(data.select.2.sc$resid, 
                       data.select.2.sc$avg_sars_cov2_conc, 
                       use = "na.or.complete")

# 2x2 comparison: top row repeats the difference-vs-signal plots; bottom row
# shows the lm residuals, whose correlations with both signals are far weaker.
{
layout(matrix(1:4, 2, 2, byrow = TRUE))
plot(x = data.select.2.sc$case_rate, y = data.select.2.sc$difference.log, 
     xlab = "", ylab = "log(conc) - case rate", main = title_prep(case_cor))
# NOTE(review): no +1 inside log() here, unlike the earlier plots — zero
# concentrations become -Inf and are silently dropped; confirm intended.
plot(x = log(data.select.2.sc$avg_sars_cov2_conc), y = data.select.2.sc$difference.log, 
     xlab = "", ylab = "", main = title_prep(waste_cor))
# x is log-transformed, so label the axis as such (was mislabelled "case_rate").
plot(x = log(data.select.2.sc$case_rate), y = data.select.2.sc$resid, 
     xlab = "log(case rate)", ylab = "linear model resid", main = title_prep(resid_case_cor))
plot(x = log(data.select.2.sc$avg_sars_cov2_conc), y = data.select.2.sc$resid,
     xlab = "log(Covid-19 concentration)", ylab = "", main = title_prep(resid_waste_cor))
}

# Lollipop chart of random-forest variable importance: segment height gives
# %IncMSE, point size gives IncNodePurity.
imp_ploting <- function(temp_mod){
  importance_df <- as.data.frame(randomForest::importance(temp_mod))
  importance_df$Var.Names <- row.names(importance_df)

  ggplot(importance_df, aes(x = Var.Names, y = `%IncMSE`)) +
    geom_segment(aes(x = Var.Names, xend = Var.Names, y = 0, yend = `%IncMSE`),
                 color = "skyblue") +
    geom_point(aes(size = IncNodePurity), color = "blue", alpha = 0.6) +
    theme_light() +
    coord_flip() +
    theme(legend.position = "bottom",
          panel.grid.major.y = element_blank(),
          panel.border = element_blank(),
          axis.ticks.y = element_blank())
}
library(randomForest)
library(randomForestExplainer)

# Re-create Adeline's analysis: a forest on the raw difference, with the two
# source signals (and the lm residual) excluded from the predictors.
adeline_df <- data.select.2.sc%>%
  select(-case_rate, -avg_sars_cov2_conc, - resid)
    

# na.roughfix imputes NAs with column medians/modes; keep.inbag is required
# by the forestFloor contribution plots further below.
ade_mod <- randomForest(difference.log ~ ., data = adeline_df, ntree=100,
                    na.action = na.roughfix, importance=TRUE, keep.inbag=TRUE)

ade_mod
## 
## Call:
##  randomForest(formula = difference.log ~ ., data = adeline_df,      ntree = 100, importance = TRUE, keep.inbag = TRUE, na.action = na.roughfix) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 4771.808
##                     % Var explained: 25.45
# Same forest setup but targeting the lm residuals instead of the raw
# difference (hypothesis one's covariate model).
resid_df <- data.select.2.sc%>%
  select(-case_rate, -avg_sars_cov2_conc, - difference.log)
    

resid_mod <- randomForest(resid ~ ., data = resid_df, ntree=100,
                    na.action = na.roughfix, importance=TRUE, keep.inbag=TRUE) 

resid_mod
## 
## Call:
##  randomForest(formula = resid ~ ., data = resid_df, ntree = 100,      importance = TRUE, keep.inbag = TRUE, na.action = na.roughfix) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 0.2608071
##                     % Var explained: 27.17
# Baseline forest predicting case_rate directly from the covariates, for
# comparison against the two residual-style targets above.
dumb_df <- data.select.2.sc%>%
  select(-difference.log, -avg_sars_cov2_conc, - resid)
dumb_mod <- randomForest(case_rate ~ ., data = dumb_df, ntree=100,
                    na.action = na.roughfix, importance=TRUE, keep.inbag=TRUE) 

dumb_mod
## 
## Call:
##  randomForest(formula = case_rate ~ ., data = dumb_df, ntree = 100,      importance = TRUE, keep.inbag = TRUE, na.action = na.roughfix) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 4694.437
##                     % Var explained: 27.26
# Variable-importance comparison across the three forests.
imp_ploting(ade_mod)+
  ggtitle("Adeline model importance plot")

imp_ploting(dumb_mod)+
  ggtitle("case model importance plot")

imp_ploting(resid_mod)+
  ggtitle("linear residual model importance plot")

# forestFloor feature-contribution plots for the difference model; the data
# must be NA-imputed the same way the forest saw it (na.roughfix).
graph_df <- adeline_df%>%
  na.roughfix()
library(forestFloor)
ff = forestFloor(ade_mod, graph_df, calc_np=T)
# One contribution plot per predictor, colored by that predictor's values;
# the hard-coded 1:8 matches the number of predictors in adeline_df.
for(i in 1:8){
  Col = fcol(ff, cols = i, outlier.lim = 2.5)
  plot(ff, col=Col, plot_GOF = T)
}
## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

# Same forestFloor contribution plots, this time for the residual model.
graph_df2 <- resid_df%>%
  na.roughfix()

ff = forestFloor(resid_mod, graph_df2, calc_np=T)
for(i in 1:8){
  Col = fcol(ff, cols = i, outlier.lim = 2.5)
  plot(ff,col=Col, plot_GOF = T)
}
## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

## [1] "compute goodness-of-fit with leave-one-out k-nearest neighbor(guassian weighting), kknn package"

A potential use case of this analysis is using the partial dependence plots as a strategy to improve fitting. One issue is that a leveling-off of the graph could mean either that the trend stops or that there is a lack of data in that area for the trees to split on. Looking at Adeline's plots we see a clear, strong linear trend in flow. Correcting for it seems to reduce variance by around 5% but makes the correlation worse; overall this is not a convincing result.

# Partial dependence of the difference model on its 4th predictor (flow).
x = randomForest::partialPlot(ade_mod, graph_df, 
                              x.var = names(graph_df)[[4]],
                              plot = FALSE)
{
plot(x, 
     xlab = names(graph_df)[[4]], 
     ylab = "conditional change of forest model output",
     main = "flow is proportional to difference")
# Eyeballed linear fit to the trend (y = -40 - 90*flow); its negation
# (40 + 90*flow) is added below to remove the trend from the data.
abline(a = -40, b = -90)
}

# Remove the eyeballed flow trend (40 + 90*flow) from the difference and from
# log concentration — both unconditionally and only on the 0 <= flow <= 1
# range where the trend was visible in the partial dependence plot.
prop_df <- data.select.2.sc%>%
  rename(flow = diff_average_flow_rate)%>%
  mutate(flow_mod_diff = 90*flow + 40 + difference.log,
         cond_flow_mod_diff = ifelse( 0 <= flow & flow <= 1,
                              90*flow + 40 + difference.log,
                              difference.log),
         flow_mod_conc = 90*flow + 40 + log(avg_sars_cov2_conc + 1),
         cond_flow_mod_conc = ifelse( 0 <= flow & flow <= 1,
                              90*flow + 40 + log(avg_sars_cov2_conc + 1),
                              log(avg_sars_cov2_conc + 1))
         )

# Variance of the difference before and after each normalization.
var_sum_df <- prop_df%>%
  summarise(name = "variance",
            'base value' = var(difference.log), 
            'linear normalization' = var(flow_mod_diff),
            'conditional normalization' = var(cond_flow_mod_diff))



# Correlation of (normalized) log concentration with case rate.
cor_sum_df <- prop_df%>%
  summarise(name = "corelation",
            'base value' = cor(log(avg_sars_cov2_conc + 1), case_rate), 
            'linear normalization' = cor(flow_mod_conc, case_rate),
            'conditional normalization' = cor(cond_flow_mod_conc, case_rate))


rbind(var_sum_df, cor_sum_df)
##         name   base value linear normalization conditional normalization
## 1   variance 6403.0302626         6130.7459012              6194.9331652
## 2 corelation    0.4652264            0.2465815                 0.2373612

Using the linear-model residual as a target, the bcov recovery rate has a clear linear trend we can try to normalize out. This gives a reduction of variance of around 7% and very slightly increases the correlation. The effect is still small enough to not be convincing.

# Partial dependence of the residual model on its 3rd predictor (the bcov
# recovery rate).
x = randomForest::partialPlot(resid_mod, graph_df2, 
                              x.var = names(graph_df2)[[3]],
                              plot = FALSE)
{
plot(x, 
     xlab = names(graph_df2)[[3]], 
     ylab = "conditional change of forest model output",
     # Title corrected: this panel shows the bcov trend, not flow
     # (the previous title was copy-pasted from the flow plot).
     main = "bcov recovery rate is proportional to residual")
# Eyeballed linear fit; its negation (.07*bcov) is added below to remove it.
abline(a = 0, b = -.07)
}

# Remove the eyeballed bcov trend (.07*bcov) from the residual and from log
# concentration — unconditionally and only on the bcov >= -2 range where the
# trend was visible in the partial dependence plot.
prop_df <- data.select.2.sc%>%
  rename(bcov = diff_bcov_rec_rate)%>%
  mutate(bcov_mod_diff = .07*bcov  + resid,
         cond_bcov_mod_diff = ifelse( -2 <= bcov,
                              .07*bcov + resid,
                              resid),
         bcov_mod_conc = .07*bcov  + log(avg_sars_cov2_conc + 1),
         cond_bcov_mod_conc = ifelse( -2 <= bcov,
                              .07*bcov + log(avg_sars_cov2_conc + 1),
                              log(avg_sars_cov2_conc + 1))
         )

# Variance of the residual before and after each normalization.
var_sum_df <- prop_df%>%
  summarise(name = "variance",
            'base value' = var(resid), 
            'linear normalization' = var(bcov_mod_diff),
            'conditional normalization' = var(cond_bcov_mod_diff))


# Correlation of (normalized) log concentration with case rate.
cor_sum_df <- prop_df%>%
  summarise(name = "corelation",
            'base value' = cor(log(avg_sars_cov2_conc + 1), case_rate), 
            'linear normalization' = cor(bcov_mod_conc, case_rate),
            'conditional normalization' = cor(cond_bcov_mod_conc, case_rate))


rbind(var_sum_df, cor_sum_df)
##         name base value linear normalization conditional normalization
## 1   variance  0.3582326            0.3341552                 0.3349880
## 2 corelation  0.4652264            0.4690614                 0.4723519
Given this conclusion we can fit these residuals into a random forest model to find what the intercept should be.


This gives good results, but its assumption of a constant ratio does not match what we expect of reality. Depending on these factors we expect both a change in intercept and a change in slope. This brings us to:

Hypothesis two: there is a linear relationship between N1 and cases. The covariates change the intercept and slope of the relationship.
# Attach region labels from the package data so region can act as a predictor.
data(WasteWater_data, package = "DSIWastewater")
reg_df <- WasteWater_data%>%
  select(regions, site)%>%
  rename(wwtp_name = site)%>%
  distinct()


rand_tree_df <- data.select.2.sc%>%
  left_join(reg_df)%>%
  select(-resid, - wwtp_name)%>%#,  -date, -tests
  mutate(regions = as.factor(regions))%>%
  na.roughfix()%>%
  # NOTE(review): is.finite() is not meaningful for non-numeric columns such
  # as the regions factor — confirm this filter keeps the intended rows.
  filter(if_all(everything(), is.finite)) # N1 + N2

  
  
# Hypothesis-two model: per-leaf linear fits of case_rate on concentration,
# with all the remaining covariates available for tree splitting.
Form <- case_rate ~ avg_sars_cov2_conc| . - avg_sars_cov2_conc #sars_cov2_adj_load
library(rsample)
forest_model <- random_linear_forest(data = rand_tree_df,
                                     num_tree = 20, 
                  model_formula = Form,
                                     max_depth = Inf,
                  verbose = FALSE)

pred = predict(forest_model, rand_tree_df)
# Residuals against the actual model target. Fixed: rand_tree_df has no
# conf_case column (that rename only exists in base_df) — the response in
# Form above is case_rate.
resid = rand_tree_df$case_rate - pred
oob_error_df <- OOB_MSE_num_trees(forest_model)

# Diagnostics panel: OOB error vs. tree count, residual histogram, residuals
# vs. predictions, and a text panel carrying the fitted model summary.
{
layout(matrix(1:4, 2, 2, byrow = TRUE))
plot(oob_error_df$num_tree, oob_error_df$model_MSE, type = "l")
hist(resid)
plot(resid, pred)
# Empty axes used purely as a canvas for the model print-out below.
plot(x = 0:10, y = 0:10, ann = F,bty = "n",type = "n",
     xaxt = "n", yaxt = "n")
 
text(x = 5, y = 5, paste(show(forest_model), collapse="\n"), cex = .7)
}
#num_var = length(graph_df) - 1
#par(mfrow=c(3,3))
#for(i in 1:(num_var)) randomForest::partialPlot(resid_mod, graph_df2, 
#                                                x.var = names(graph_df2)[[i]],
#                                                main = names(graph_df2)[[i]])

#x = randomForest::partialPlot(resid_mod, graph_df2, 
#                                                x.var = names(graph_df2)[[3]],
#                                                xlab = names(graph_df2)[[3]])
#par(mfrow=c(1,1))
# Permutation importance for the random linear forest, shown lollipop-style
# like imp_ploting() above (segment = incMSE, point size = old_incMSE).
inc_MSE <- gen_INCMSE(forest_model)
#reran model
#or


ggplot(inc_MSE, aes(x = var, y = incMSE)) +
  geom_segment( aes(x=var, xend=var, y = 0, yend=incMSE), color="skyblue") +
  geom_point(aes(size = old_incMSE), color="blue", alpha=0.6) +
  theme_light() +
  coord_flip() +
  theme(
    legend.position="bottom",
    panel.grid.major.y = element_blank(),
    panel.border = element_blank(),
    axis.ticks.y = element_blank()
)
inc_MSE

# Predictions on the training data. NOTE(review): exp() implies the model was
# fit on a log scale, but Form above targets raw case_rate — confirm.
rand_tree_df$pred <- exp(predict(forest_model, forest_model@data))
rand_tree_df2 <- rand_tree_df%>%
  select(regions, date, pred)

# Predicted vs. observed case series over time.
B <- rand_tree_df%>%
  ggplot(aes(x = date))+
  geom_line(aes(y = log(pred), color = "red"))+
  # Fixed: rand_tree_df has no conf_case column (that rename is only in
  # base_df); plot the model target case_rate instead.
  geom_line(aes(y = case_rate))
# plotly must be attached before ggplotly() is first called (it was
# previously only loaded much further down the script).
library(plotly)
ggplotly(B)
# Fit 7-day rolling regressions to the predicted series per region and flag
# time points with an upward trend (slope > 1).
CaseRegressionOutput <- buildRegressionEstimateTable(DataMod = rand_tree_df2, 
    RunOn = "pred",
    SplitOn = "regions", DaysRegressed = 7)%>%
  rename(site = regions)

CaseFlags2 <- CaseRegressionOutput%>%
  mutate(flag = as.numeric(lmreg_slope > 1))%>%
  select(date, flag)


# Total number of flags raised by the tree-based method.
CaseFlags2%>%
  select(-date)%>%
  summarise(s = sum(flag))

# Load the pre-computed case-based flag set and summarise flag counts
# (treating NA flags as 0).
load("CaseFlags.Rdata")
CaseFlags%>%
  select(-date)%>%
  mutate(across(case_flag_Cases:slope_switch_flag_7DayCases, ~ ifelse(is.na(.x),0,.x)))%>%
  summarise(across(case_flag_Cases:slope_switch_flag_7DayCases, sum))

CaseFlags <- CaseFlags%>%
  select(date, case_flag_Cases)

# Join the two flag series on date (numeric dates converted using the
# standard 1970-01-01 origin), then score agreement as mean squared distance.
full_data <- inner_join(CaseFlags, 
                       mutate(CaseFlags2,date = as.Date(date, lubridate::mdy("1/1/1970"))))%>%
  DF_date_vector("date", c("case_flag_Cases", "flag"))



mean(date_distance_calc(full_data, "case_flag_Cases", "flag")$flag**2, na.rm = TRUE)


# NOTE(review): date_Flag_DF is not defined anywhere in this document —
# presumably created elsewhere; confirm before running top-to-bottom.
old_df <- date_Flag_DF%>%
  select(date, flag.ntile_90_0.9)

# Overlay observed cases with the three flag series; flags are offset
# vertically (1, 1.25, 1.5) so the series are visually distinguishable.
A <- full_data%>%
  ggplot(aes(x = date))+
  geom_point(aes(y = conf_case, x = as.Date(date, lubridate::mdy("1/1/1970"))), data = base_df)+
  geom_point(aes(y = 1.25 + as.numeric(flag)/10000, color = "tree"))+
  geom_point(aes(y = 1 + as.numeric(case_flag_Cases)/10000, color = "case"))+
geom_point(aes(y = 1.5 + as.numeric(flag.ntile_90_0.9)/10000, color = "wastewater"), data = old_df)
library(plotly)
ggplotly(A)